#read in manhattan rides data
manhattan_rides_df <- read_csv("manhattan_rides.csv")

# the code below saves trip_min, gender, age_group variables of manhattan_rides_df
# and converts age_group and gender to factors

trip_dur_age_gender_df <-
  manhattan_rides_df %>% 
  select(trip_min, gender, age_group) %>% 
  mutate(
    age_group = factor(age_group, ordered = T,
                       levels = c("18-25","26-35", "36-45", "46-55", "56-65", "66-85")),
    gender = type.convert(gender, as.is = F)
  )
# looking at the data we can tell there are massive outliers in trip duration
# so i am going to filter them out using the IQR method

Q1 <- quantile(pull(trip_dur_age_gender_df, trip_min), probs = 0.25)
Q3 <- quantile(pull(trip_dur_age_gender_df, trip_min), probs = 0.75)
inter_quart <- IQR(pull(trip_dur_age_gender_df, trip_min))

trip_dur_age_gender_df <-
  trip_dur_age_gender_df %>% 
    filter(
      trip_min >= Q1 - 1.5*inter_quart, 
      trip_min <= Q3 + 1.5*inter_quart
    )
#this generates a plot of boxplots of trip duration (minutes) by gender and age_group
trip_dur_age_gender_df %>% 
  mutate(
    gender = str_to_sentence(gender)
  ) %>% 
  plot_ly(x = ~age_group, y = ~trip_min, color = ~gender, type = "box", colors = "viridis") %>%  
  layout(
    boxmode = "group",
    xaxis = list(title = "Age Range"), 
    yaxis = list(title = "Trip Duration (min)"), 
    legend = list(title = list(text = "<b> Gender </b>"))
  )
# this dataframe groups rides by gender and month
# and provides the average age for each gender in that month
# along with the standard deviation, standard error, lower bound, and upper bound
# which are then used to create a plotly graph where we get average age per month for each gender
# with 95% confidence bands around each line
avg_age_per_month_df <-
  read_csv("manhattan_rides.csv") %>% 
  mutate(
    date = floor_date(as_date(starttime), "month")
  ) %>% 
  select(date, gender, age) %>% 
  group_by(date, gender) %>% 
  summarize(
    total = n(),
    avg_age = mean(age),
    sd_age = sd(age)
  ) %>% 
  mutate(
    sem = sd_age/sqrt(total - 1), 
    lower_bound = avg_age + qt(0.025, df = total - 1) * sem, 
    upper_bound = avg_age - qt(0.025, df = total - 1) * sem
  ) %>% 
  ungroup()

avg_age_plot <-
  avg_age_per_month_df %>% 
  mutate(
    gender = str_to_sentence(gender)
  ) %>% 
  ggplot(aes(x = date, y = avg_age, color = gender)) +
  geom_line(size = 1, alpha = 0.8) +
  geom_ribbon(aes(ymin = lower_bound, ymax = upper_bound), alpha = 0.2)

ggplotly(avg_age_plot) %>% 
  layout(
    xaxis = list(title = "Date"), 
    yaxis = list(title = "Age"), 
    legend = list(title = list(text = "<b> Gender </b>"))
  )